I found growth curves for girls and boys in Israel:
For example, see this:
I used the great online resource Web Plot Digitizer v4 to extract the data from the images files. I captured all the growth curves as best as I could. The first step now is to get interpolated versions of the digitized data. For instance, see below the 50th percentile for boys:
import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style= "ticks" , font_scale= 1.5 )
from scipy.optimize import curve_fit
from scipy.special import erf
from scipy.interpolate import UnivariateSpline
import matplotlib.animation as animation
from scipy.stats import norm
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'notebook'
# %matplotlib widget
define useful arrays
age_list = np.round (np.arange(2.0 , 20.1 , 0.1 ), 1 )
height_list = np.round (np.arange(70 , 220 , 0.1 ), 1 )
import sample data, boys 50th percentile
df_temp_boys_50th = pd.read_csv('../archive/data/height/boys-p50.csv' , names= ['age' ,'height' ])
spline = UnivariateSpline(df_temp_boys_50th['age' ], df_temp_boys_50th['height' ], s= 0.5 )
interpolated = spline(age_list)
plot
fig, ax = plt.subplots(figsize= (10 , 6 ))
ax.plot(df_temp_boys_50th['age' ], df_temp_boys_50th['height' ], label= 'digitized data' ,
marker= 'o' , markerfacecolor= 'None' , markeredgecolor= "black" , markersize= 6 , linestyle= 'None' )
ax.plot(age_list, interpolated, label= 'interpolated' , color= "black" , linewidth= 2 )
ax.set (xlabel= 'age (years)' ,
ylabel= 'height (cm)' ,
xticks= np.arange(2 , 21 , 2 ),
title= "boys, 50th percentile"
)
ax.legend(frameon= False );
Let’s do the same for all the other curves, and then save them to a file.
interpolate all growth curves
col_names = ['p05' , 'p10' , 'p25' , 'p50' , 'p75' , 'p90' , 'p95' ]
file_names_boys = ['boys-p05.csv' , 'boys-p10.csv' , 'boys-p25.csv' , 'boys-p50.csv' ,
'boys-p75.csv' , 'boys-p90.csv' , 'boys-p95.csv' ,]
file_names_girls = ['girls-p05.csv' , 'girls-p10.csv' , 'girls-p25.csv' , 'girls-p50.csv' ,
'girls-p75.csv' , 'girls-p90.csv' , 'girls-p95.csv' ,]
# create dataframe with age column
df_boys = pd.DataFrame({'age' : age_list})
df_girls = pd.DataFrame({'age' : age_list})
# loop over file names and read in data
for i, file_name in enumerate (file_names_boys):
# read in data
df_temp = pd.read_csv('../archive/data/height/' + file_name, names= ['age' ,'height' ])
spline = UnivariateSpline(df_temp['age' ], df_temp['height' ], s= 0.5 )
df_boys[col_names[i]] = spline(age_list)
for i, file_name in enumerate (file_names_girls):
# read in data
df_temp = pd.read_csv('../archive/data/height/' + file_name, names= ['age' ,'height' ])
spline = UnivariateSpline(df_temp['age' ], df_temp['height' ], s= 0.5 )
df_girls[col_names[i]] = spline(age_list)
# make age index
df_boys.set_index('age' , inplace= True )
df_boys.index = df_boys.index.round (1 )
df_boys.to_csv('../archive/data/height/boys_height_vs_age_combined.csv' , index= True )
df_girls.set_index('age' , inplace= True )
df_girls.index = df_girls.index.round (1 )
df_girls.to_csv('../archive/data/height/girls_height_vs_age_combined.csv' , index= True )
Let’s take a look at what we just did.
age
2.0
79.269087
80.794167
83.049251
85.155597
87.475854
89.779822
90.882059
2.1
80.202106
81.772053
84.052858
86.207778
88.713405
90.883740
92.409913
2.2
81.130687
82.706754
85.011591
87.211543
89.856186
91.940642
93.416959
2.3
82.048325
83.601023
85.928399
88.170313
90.914093
92.953965
94.270653
2.4
82.948516
84.457612
86.806234
89.087509
91.897022
93.927147
95.226089
...
...
...
...
...
...
...
...
19.6
152.520938
154.812286
158.775277
163.337149
167.699533
171.531349
173.969235
19.7
152.534223
154.814440
158.791925
163.310864
167.704618
171.519600
173.980150
19.8
152.548001
154.827666
158.815071
163.275852
167.708562
171.504730
173.990964
19.9
152.562338
154.853760
158.845506
163.231563
167.711342
171.486629
174.001704
20.0
152.577300
154.894521
158.884019
163.177444
167.712936
171.465189
174.012396
181 rows × 7 columns
show all interpolated curves for girls
fig, ax = plt.subplots(figsize= (8 , 6 ))
# loop over col_names and plot each column
colors = sns.color_palette("Oranges" , len (col_names))
for col, color in zip (col_names, colors):
ax.plot(df_girls.index, df_girls[col], label= col, color= color)
ax.set (xlabel= 'age (years)' ,
ylabel= 'height (cm)' ,
xticks= np.arange(2 , 21 , 2 ),
title= "growth curves for girls \n percentile curves: 5, 10, 25, 50, 75, 90, 95" ,
);
Let’s now see the percentiles for girls age 20.
plot cdf for girls, age 20
fig, ax = plt.subplots(figsize= (8 , 6 ))
percentile_list = np.array([5 , 10 , 25 , 50 , 75 , 90 , 95 ])
data = df_girls.loc[20.0 ]
ax.plot(data, percentile_list, ls= '' , marker= 'o' , markersize= 6 , color= "black" )
ax.set (xlabel= 'height (cm)' ,
ylabel= 'percentile' ,
yticks= percentile_list,
title= "cdf for girls, age 20"
);
I suspect that the heights in the population are normally distributed. Let’s check that. I’ll fit the data to the integral of a gaussian, because the percentiles correspond to a cdf. If a pdf is a gaussian, its cumulative is given by
\Phi(x) = \frac{1}{2} \left( 1 + \text{erf}\left(\frac{x - \mu}{\sigma \sqrt{2}}\right) \right)
where \mu is the mean and \sigma is the standard deviation of the distribution. The error function \text{erf} is a sigmoid function, which is a good approximation for the cdf of the normal distribution.
define functions
def erf_model(x, mu, sigma):
return 50 * (1 + erf((x - mu) / (sigma * np.sqrt(2 ))) )
# initial guess for parameters: [mu, sigma]
p0 = [150 , 6 ]
# Calculate R-squared
def calculate_r2(y_true, y_pred):
ss_res = np.sum ((y_true - y_pred) ** 2 )
ss_tot = np.sum ((y_true - np.mean(y_true)) ** 2 )
return 1 - (ss_res / ss_tot)
fit model to data
data = df_girls.loc[20.0 ]
params, _ = curve_fit(erf_model, data, percentile_list, p0= p0,
bounds= ([100 , 3 ], # lower bounds for mu and sigma
[200 , 10 ]) # upper bounds for mu and sigma
)
# store the parameters in the dataframe
percentile_predicted = erf_model(data, * params)
# R-squared value
r2 = calculate_r2(percentile_list, percentile_predicted)
show results
fig, ax = plt.subplots(figsize= (8 , 6 ))
percentile_list = np.array([5 , 10 , 25 , 50 , 75 , 90 , 95 ])
data = df_girls.loc[20.0 ]
ax.plot(data, percentile_list, ls= '' , marker= 'o' , markersize= 6 , color= "black" , label= 'data' )
fit = erf_model(height_list, * params)
ax.plot(height_list, fit, label= 'fit' , color= "red" , linewidth= 2 )
ax.text(150 , 75 , f'$\mu$ = { params[0 ]:.1f} cm \n $\sigma$ = { params[1 ]:.1f} cm \n R$^2$ = { r2:.6f} ' ,
fontsize= 14 , bbox= dict (facecolor= 'white' , alpha= 0.5 ))
ax.legend(frameon= False )
ax.set (xlabel= 'height (cm)' ,
xlim= (140 , 190 ),
ylabel= 'percentile' ,
yticks= percentile_list,
title= "the data is very well fitted by a normal distribution"
);
Another way of making sure that the model fits the data is to make a QQ plot. In this plot, the quantiles of the data are plotted against the quantiles of the normal distribution. If the data is normally distributed, the points should fall on a straight line.
show QQ plot
fitted_quantiles = norm.cdf(data, loc= params[0 ], scale= params[1 ])
experimental_quantiles = percentile_list / 100
fig, ax = plt.subplots(figsize= (8 , 6 ))
ax.set_aspect('equal' , adjustable= 'box' )
ax.plot(experimental_quantiles, fitted_quantiles,
ls= '' , marker= 'o' , markersize= 6 , color= "black" ,
label= 'qq points' )
ax.plot([0 , 1 ], [0 , 1 ], color= 'red' , linewidth= 2 , label= "1:1 line" )
ax.set (xlabel= 'empirical quantiles' ,
ylabel= 'fitted quantiles' ,
xlim= (0 , 1 ),
ylim= (0 , 1 ),
title= "QQ plot" )
ax.legend(frameon= False )
Great, now we just need to do exactly the same for both sexes, and all the ages. I chose to divide age from 2 to 20 into 0.1 intervals.
create dataframes to store the parameters mu, sigma, r2
df_stats_boys = pd.DataFrame(index= age_list, columns= ['mu' , 'sigma' , 'r2' ])
df_stats_boys['mu' ] = 0.0
df_stats_boys['sigma' ] = 0.0
df_stats_boys['r2' ] = 0.0
df_stats_girls = pd.DataFrame(index= age_list, columns= ['mu' , 'sigma' , 'r2' ])
df_stats_girls['mu' ] = 0.0
df_stats_girls['sigma' ] = 0.0
df_stats_girls['r2' ] = 0.0
fit model to all the data
p0 = [80 , 3 ]
# loop over ages in the index, calculate mu and sigma
for i in df_boys.index:
# fit the model to the data
data = df_boys.loc[i]
params, _ = curve_fit(erf_model, data, percentile_list, p0= p0,
bounds= ([70 , 2 ], # lower bounds for mu and sigma
[200 , 10 ]) # upper bounds for mu and sigma
)
# store the parameters in the dataframe
df_stats_boys.at[i, 'mu' ] = params[0 ]
df_stats_boys.at[i, 'sigma' ] = params[1 ]
percentile_predicted = erf_model(data, * params)
# R-squared value
r2 = calculate_r2(percentile_list, percentile_predicted)
df_stats_boys.at[i, 'r2' ] = r2
p0 = params
# same for girls
p0 = [80 , 3 ]
for i in df_girls.index:
# fit the model to the data
data = df_girls.loc[i]
params, _ = curve_fit(erf_model, data, percentile_list, p0= p0,
bounds= ([70 , 3 ], # lower bounds for mu and sigma
[200 , 10 ]) # upper bounds for mu and sigma
)
# store the parameters in the dataframe
df_stats_girls.at[i, 'mu' ] = params[0 ]
df_stats_girls.at[i, 'sigma' ] = params[1 ]
percentile_predicted = erf_model(data, * params)
# R-squared value
r2 = calculate_r2(percentile_list, percentile_predicted)
df_stats_girls.at[i, 'r2' ] = r2
p0 = params
Let’s see what we got. The top panel in the graph shows the average height for boys and girls, the middle panel shows the coefficient of variation (\sigma/\mu ), and the bottom panel shows the R2 of the fit (note that the range is very close to 1).
2.0
86.463069
3.563785
0.999511
2.1
87.374895
3.596583
0.999676
2.2
88.269676
3.627433
0.999742
2.3
89.148086
3.657263
0.999752
2.4
90.010783
3.686764
0.999733
...
...
...
...
19.6
176.802810
7.134561
0.999991
19.7
176.845789
7.135786
0.999994
19.8
176.892196
7.137430
0.999995
19.9
176.942521
7.139466
0.999990
20.0
176.997255
7.141858
0.999976
181 rows × 3 columns
plot results
fig, ax = plt.subplots(3 ,1 , figsize= (8 , 10 ), sharex= True )
fig.subplots_adjust(left= 0.15 )
ax[0 ].plot(df_stats_boys['mu' ], label= 'boys' , lw= 2 )
ax[0 ].plot(df_stats_girls['mu' ], label= 'girls' , lw= 2 )
ax[0 ].legend(frameon= False )
ax[1 ].plot(df_stats_boys['sigma' ] / df_stats_boys['mu' ], lw= 2 )
ax[1 ].plot(df_stats_girls['sigma' ] / df_stats_girls['mu' ], lw= 2 )
ax[2 ].plot(df_stats_boys.index, df_stats_boys['r2' ], label= r'$r2$ boys' , lw= 2 )
ax[2 ].plot(df_stats_girls.index, df_stats_girls['r2' ], label= r'$r2$ girls' , lw= 2 )
ax[0 ].set (ylabel= 'average height (cm)' ,)
ax[1 ].set (ylabel= 'CV' ,
ylim= [0 ,0.055 ])
ax[2 ].set (xlabel= 'age (years)' ,
ylabel= r'$R^2$' ,
xticks= np.arange(2 , 21 , 2 ),
);
Let’s see how the pdfs for boys and girls move and morph as age increases.
produce dataframes for pre-calculated pdfs
age_list_string = age_list.astype(str ).tolist()
df_pdf_boys = pd.DataFrame(index= height_list, columns= age_list_string)
df_pdf_girls = pd.DataFrame(index= height_list, columns= age_list_string)
for age in df_pdf_boys.columns:
age_float = round (float (age), 1 )
df_pdf_boys[age] = norm.pdf(height_list,
loc= df_stats_boys.loc[age_float]['mu' ],
scale= df_stats_boys.loc[age_float]['sigma' ])
for age in df_pdf_girls.columns:
age_float = round (float (age), 1 )
df_pdf_girls[age] = norm.pdf(height_list,
loc= df_stats_girls.loc[age_float]['mu' ],
scale= df_stats_girls.loc[age_float]['sigma' ])
70.0
0.000006
2.962419e-06
1.229580e-06
4.740717e-07
1.893495e-07
7.928033e-08
3.395629e-08
1.454961e-08
6.214658e-09
2.698367e-09
...
3.876760e-46
4.998212e-46
6.108274e-46
6.965756e-46
7.300518e-46
6.928073e-46
5.866310e-46
4.367574e-46
2.817087e-46
1.550490e-46
70.1
0.000007
3.369929e-06
1.401926e-06
5.423176e-07
2.172465e-07
9.118694e-08
3.914667e-08
1.681357e-08
7.199311e-09
3.133161e-09
...
4.821662e-46
6.212999e-46
7.589544e-46
8.652519e-46
9.067461e-46
8.605908e-46
7.289698e-46
5.430839e-46
3.506265e-46
1.932327e-46
70.2
0.000008
3.830459e-06
1.597215e-06
6.199308e-07
2.490751e-07
1.048086e-07
4.509972e-08
1.941687e-08
8.334521e-09
3.635676e-09
...
5.995467e-46
7.721230e-46
9.427830e-46
1.074523e-45
1.125944e-45
1.068759e-45
9.056344e-46
6.751373e-46
4.363019e-46
2.407630e-46
70.3
0.000009
4.350475e-06
1.818328e-06
7.081296e-07
2.853621e-07
1.203810e-07
5.192270e-08
2.240831e-08
9.642428e-09
4.216078e-09
...
7.453283e-46
9.593350e-46
1.170864e-45
1.334099e-45
1.397806e-45
1.326973e-45
1.124851e-45
8.391039e-46
5.427845e-46
2.999137e-46
70.4
0.000010
4.937172e-06
2.068480e-06
8.082806e-07
3.267014e-07
1.381707e-07
5.973725e-08
2.584341e-08
1.114829e-08
4.885994e-09
...
9.263403e-46
1.191661e-45
1.453785e-45
1.655996e-45
1.734906e-45
1.647188e-45
1.396806e-45
1.042648e-45
6.750965e-46
3.735083e-46
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
219.5
0.000000
5.214425e-307
1.377605e-289
3.568527e-277
6.457994e-266
2.232144e-255
6.340272e-246
9.969867e-238
1.389324e-230
5.441854e-224
...
5.200570e-18
5.741874e-18
6.194642e-18
6.495054e-18
6.583545e-18
6.417949e-18
5.986319e-18
5.315101e-18
4.468701e-18
3.538724e-18
219.6
0.000000
1.813597e-307
5.050074e-290
1.356408e-277
2.537010e-266
9.046507e-256
2.642444e-246
4.256155e-238
6.055129e-231
2.417510e-224
...
4.558798e-18
5.035058e-18
5.433557e-18
5.698034e-18
5.775970e-18
5.630212e-18
5.250299e-18
4.659675e-18
3.915265e-18
3.097919e-18
219.7
0.000000
6.302763e-308
1.849870e-290
5.151948e-278
9.959447e-267
3.663840e-256
1.100546e-246
1.815751e-238
2.637298e-231
1.073274e-224
...
3.995288e-18
4.414220e-18
4.764871e-18
4.997654e-18
5.066279e-18
4.938013e-18
4.603699e-18
4.084117e-18
3.429566e-18
2.711382e-18
219.8
0.000000
2.188653e-308
6.771033e-291
1.955386e-278
3.906942e-267
1.482823e-256
4.580523e-247
7.741154e-239
1.147918e-231
4.761829e-225
...
3.500614e-18
3.869030e-18
4.177503e-18
4.382343e-18
4.442754e-18
4.329907e-18
4.035791e-18
3.578814e-18
3.003413e-18
2.372514e-18
219.9
0.000000
7.594139e-309
2.476504e-291
7.416066e-279
1.531537e-267
5.997065e-257
1.905138e-247
3.298116e-239
4.993198e-232
2.111339e-225
...
3.066470e-18
3.390384e-18
3.661688e-18
3.841895e-18
3.895062e-18
3.795805e-18
3.537115e-18
3.135297e-18
2.629596e-18
2.075507e-18
1500 rows × 181 columns
plotly widget
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'notebook'
# create figure
fig = go.Figure()
# assume both dataframes have the same columns (ages) and index (height)
ages = df_pdf_boys.columns
x_vals = df_pdf_boys.index
# add traces: 2 per age (boys and girls), all hidden except the first pair
for i, age in enumerate (ages):
fig.add_trace(go.Scatter(x= x_vals, y= df_pdf_boys[age], name= f'Boys { age} ' ,
line= dict (color= '#1f77b4' ), visible= (i == 0 )))
fig.add_trace(go.Scatter(x= x_vals, y= df_pdf_girls[age], name= f'Girls { age} ' ,
line= dict (color= '#ff7f0e' ), visible= (i == 0 )))
# create slider steps
steps = []
for i, age in enumerate (ages):
vis = [False ] * (2 * len (ages))
vis[2 * i] = True # boys trace
vis[2 * i + 1 ] = True # girls trace
steps.append(dict (
method= 'update' ,
args= [{'visible' : vis},
{'title' : f'Height Distribution - Age: { age} ' }],
label= str (age)
))
# define slider
sliders = [dict (
active= 0 ,
currentvalue= {"prefix" : "Age: " },
pad= {"t" : 50 },
steps= steps
)]
# update layout
fig.update_layout(
sliders= sliders,
title= 'Height Distribution by Age' ,
xaxis_title= 'Height (cm)' ,
yaxis_title= 'Density' ,
yaxis= dict (range = [0 , 0.12 ]),
showlegend= True ,
height= 600 ,
width= 800
)
fig.show()